# This script combines the labelled forms for the Egypt human-labelled data.
# It processes them by dropping irrelevant columns and rows, averages the
# labels per item ID, and saves one cleaned .rds file per form.
# It removes rows labelled as "not relevant".
library(dplyr)
library(tidylog)
library(data.table)
library(tidyr)
library(purrr)
library(quanteda)

# Define input and output directories
input_dir <- "data/qualtrics/completed_forms"
output_dir <- "data/qualtrics/completed_forms_cleaned"

# Create the output directory on first run.
# dir.create() only warns and returns FALSE when it fails, so its result is
# checked explicitly: otherwise the success message below would be printed
# even though nothing was created, and the script would only fail much later
# when the first form is saved.
if (!dir.exists(output_dir)) {
  if (!dir.create(output_dir, recursive = TRUE)) {
    stop("Could not create output directory: ", output_dir, call. = FALSE)
  }
  cat("Created output directory:", output_dir, "\n")
}

# Helper function to process a single CSV file into per-ID average scores.
#
# Arguments:
# - file: full path to the CSV file.
# - drop_rows: vector of row indices to drop (an empty vector drops nothing).
# - extra_filter: (optional) function applied to the long-format data
#   (columns `ID` and `value`) before averaging; defaults to the identity.
# - n_leading_cols: (optional) number of leading columns to discard before
#   reshaping; defaults to 17, the value this script has always used.
#
# Returns a data frame with one row per `ID` (the original column name) and
# `score_avg`, the mean of that column's usable integer values.
process_file <- function(file, drop_rows, extra_filter = function(x) x,
                         n_leading_cols = 17L) {
  # An out-of-range files[k] lookup yields NA; fail here with a clear message
  # rather than inside fread().
  if (length(file) != 1L || is.na(file)) {
    stop("`file` must be a single, non-missing path.", call. = FALSE)
  }

  dt <- fread(file)

  # Remove the leading columns (first 17 by default).
  if (n_leading_cols > 0L) {
    dt <- dt[, -seq_len(n_leading_cols), with = FALSE]
  }

  # Guard the negative index so an empty `drop_rows` keeps every row.
  if (length(drop_rows) > 0L) {
    dt <- dt[-drop_rows, ]
  }

  # Reshape to one row per (column, response). The integer conversion must
  # happen BEFORE the NA filter: fread() reads blank cells in character
  # columns as "" rather than NA, so filtering first lets blanks through and
  # they then turn into NA and make mean() return NA for the whole ID.
  dt_long <- dt %>%
    pivot_longer(cols = everything(), names_to = "ID", values_to = "value") %>%
    mutate(value = as.integer(value)) %>%
    filter(!is.na(value))

  dt_long <- extra_filter(dt_long)

  dt_agg <- dt_long %>%
    group_by(ID) %>%
    summarise(score_avg = mean(value), .groups = "drop")

  dt_agg
}

# Get list of input files from the new directory.
# The forms below address these files purely by position (files[1] through
# files[9]) in the order returned by list.files(), so the count is validated
# up front: a missing file would otherwise surface later as an NA path, and
# extra files can silently shift which file feeds which form.
files <- list.files(input_dir, full.names = TRUE)
n_expected_files <- 9L

if (length(files) < n_expected_files) {
  stop(
    sprintf(
      "Expected at least %d input files in '%s' but found %d.",
      n_expected_files, input_dir, length(files)
    ),
    call. = FALSE
  )
}

if (length(files) > n_expected_files) {
  warning(
    sprintf(
      "Found %d files in '%s' but only the first %d are used; check that the positional file-to-form mapping is still correct.",
      length(files), input_dir, n_expected_files
    ),
    call. = FALSE
  )
}

# --- Process Each Form ---
# Each form is read from one or two input files (selected by position in
# `files`), reduced to per-ID averages by process_file(), and written to
# `output_dir` as an .rds file.

# Row filter shared by the "Part B" files: keeps rows whose value is not 10.
# NOTE(review): 10 is presumably the "not relevant" code -- confirm.
drop_value_ten <- function(df) {
  filter(df, value != 10)
}

# Builds the full output path for a file name inside `output_dir`.
out_path <- function(file_name) {
  file.path(output_dir, file_name)
}

## FORM 1 (two parts):
#   Part A comes from files[1] with the first 2 rows dropped.
#   Part B comes from files[2] with the first 7 rows dropped and 10s removed.
form1_partA <- process_file(files[1], drop_rows = seq_len(2))
form1_partB <- process_file(
  files[2],
  drop_rows = seq_len(7),
  extra_filter = drop_value_ten
)
form1 <- form1_partA %>% bind_rows(form1_partB)
form1 %>% saveRDS(file = out_path("form1.rds"))

## FORM 2: files[3] with the first 2 rows dropped.
form2 <- process_file(files[3], drop_rows = seq_len(2))
form2 %>% saveRDS(file = out_path("form2.rds"))

## FORM 3: files[4] with the first 3 rows dropped.
form3 <- process_file(files[4], drop_rows = seq_len(3))
form3 %>% saveRDS(file = out_path("form3.rds"))

## FORM 4: files[5] with the first 2 rows dropped.
form4 <- process_file(files[5], drop_rows = seq_len(2))
form4 %>% saveRDS(file = out_path("form4.rds"))

## FORM 5 (two parts):
#   Part A comes from files[6] with the first 2 rows dropped.
#   Part B comes from files[7] with the first 2 rows dropped and 10s removed.
form5_partA <- process_file(files[6], drop_rows = seq_len(2))
form5_partB <- process_file(
  files[7],
  drop_rows = seq_len(2),
  extra_filter = drop_value_ten
)
form5 <- form5_partA %>% bind_rows(form5_partB)
form5 %>% saveRDS(file = out_path("form5.rds"))

## FORM 6: files[8] with the first 2 rows dropped.
form6 <- process_file(files[8], drop_rows = seq_len(2))
form6 %>% saveRDS(file = out_path("form6.rds"))

## FORM 7: files[9] with the first 2 rows dropped.
form7 <- process_file(files[9], drop_rows = seq_len(2))
form7 %>% saveRDS(file = out_path("form7.rds"))
